import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from datetime import datetime

Credit_Control_df = pd.read_csv(r"C:\Users\jki\Desktop\Data Scence Projects\Credit Control Debtors Aging Report\Machine Learning\Source Data\Credit Control Report.csv" ) 
Credit_Control_df.head(5)

Credit_Control_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Cutomer ID      178 non-null    object
 1   Customer  Name  178 non-null    object
 2   Invoice Number  178 non-null    object
 3   Amount          178 non-null    object
 4   Invoice Date    178 non-null    object
 5   Due Date        178 non-null    object
dtypes: object(6)
memory usage: 8.5+ KB

# Convert date columns to datetime
Credit_Control_df['Invoice Date'] = pd.to_datetime(Credit_Control_df['Invoice Date'])
Credit_Control_df['Due Date'] = pd.to_datetime(Credit_Control_df['Due Date'])

# Convert amount to numeric
Credit_Control_df['Amount'] =  pd.to_numeric(Credit_Control_df['Amount'],errors='coerce')

Credit_Control_df.info

<bound method DataFrame.info of     Cutomer ID                    Customer  Name Invoice Number  Amount  \
0       CA00040                      Hayes-Koepp     CI47849371     NaN   
1       CA00394                Aufderhar-Gaylord     UI98518744     NaN   
2       CA00419                 Witting and Sons     UI76967580     NaN   
3       CA00897  Williamson, Harvey and Lubowitz     CI30071955     NaN   
4       CA00271                  Zemlak-Anderson     CI88968287     NaN   
..          ...                              ...            ...     ...   
173     CA00350                          Rau Inc     UI78257094     NaN   
174     CA00854                  King-Wintheiser     CI83073426     NaN   
175     CA00989     Streich, Mueller and McClure     CI49062755     NaN   
176     CA00313      Vandervort, Huel and Wunsch     CI29236858     NaN   
177     CA00200                    Lehner-Harber     CI42174722     NaN   

    Invoice Date   Due Date  
0     2020-12-11 2021-01-10  
1     2020-12-04 2021-01-03  
2     2020-08-07 2020-09-06  
3     2020-12-20 2021-01-19  
4     2020-09-13 2020-10-13  
..           ...        ...  
173   2020-11-28 2020-12-28  
174   2020-12-19 2021-01-18  
175   2020-11-07 2020-12-07  
176   2020-11-30 2020-12-30  
177   2020-10-07 2020-11-06  

[178 rows x 6 columns]>

print(Credit_Control_df.isnull().sum())

Cutomer ID          0
Customer  Name      0
Invoice Number      0
Amount            178
Invoice Date        0
Due Date            0
dtype: int64

Credit_Control_df['Amount'].fillna(0, inplace=True)

print(Credit_Control_df.isnull().sum())

Cutomer ID        0
Customer  Name    0
Invoice Number    0
Amount            0
Invoice Date      0
Due Date          0
dtype: int64

# Calculate the payment delay
Credit_Control_df['payment_delay'] = (Credit_Control_df['Due Date'] - Credit_Control_df['Invoice Date']).dt.days

Credit_Control_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Cutomer ID      178 non-null    object        
 1   Customer  Name  178 non-null    object        
 2   Invoice Number  178 non-null    object        
 3   Amount          178 non-null    float64       
 4   Invoice Date    178 non-null    datetime64[ns]
 5   Due Date        178 non-null    datetime64[ns]
 6   payment_delay   178 non-null    int64         
dtypes: datetime64[ns](2), float64(1), int64(1), object(3)
memory usage: 9.9+ KB

# Group by company name and calculate metrics
payment_behavior = Credit_Control_df.groupby('Customer  Name').agg(
    total_amount=('Amount', 'sum'),
    average_delay=('payment_delay', 'mean'),
    count_invoices=('Invoice Number', 'count')
).reset_index()

import os
os.environ["OMP_NUM_THREADS"] = "1"

from sklearn.cluster import KMeans
import numpy as np

# Prepare data for clustering
X = payment_behavior[['average_delay', 'total_amount']]

# Perform KMeans clustering
kmeans = KMeans(n_clusters=3, random_state=0).fit(X)
payment_behavior['cluster'] = kmeans.labels_

C:\Users\jki\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
C:\Users\jki\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
  warnings.warn(
C:\Users\jki\AppData\Local\Temp\ipykernel_13172\108334237.py:11: ConvergenceWarning: Number of distinct clusters (1) found smaller than n_clusters (3). Possibly due to duplicate points in X.
  kmeans = KMeans(n_clusters=3, random_state=0).fit(X)

import json

# Suggest extending due dates for debtors in the cluster with the highest average delay
payment_behavior['suggested_due_date_extension'] = np.where(
    payment_behavior['cluster'] == payment_behavior['cluster'].max(),
    payment_behavior['average_delay'] + 10,  # Extend by 10 days
    payment_behavior['average_delay']
)

# Display the final result
print(payment_behavior)

# Save the recommendations to a JSON file
output_json_path = 'vendor_recommendations.json'
payment_behavior.to_json(output_json_path, orient='records', lines=True)

print(f"Vendor recommendations saved to: {output_json_path}")

                  Customer  Name  total_amount  average_delay  count_invoices  \
0   Abbott, Dibbert and Prohaska           0.0           30.0               1   
1                 Abshire-Abbott           0.0           30.0               1   
2     Armstrong, Rau and Quigley           0.0           30.0               1   
3              Aufderhar-Gaylord           0.0           30.0               2   
4   Bartoletti, Ledner and Davis           0.0           30.0               2   
..                           ...           ...            ...             ...   
71                 Willms-Brekke           0.0           30.0               2   
72              Witting and Sons           0.0           30.0               5   
73       Wolf, Becker and Brekke           0.0           30.0               2   
74                   Wyman Group           0.0           30.0               1   
75               Zemlak-Anderson           0.0           30.0               2   

    cluster  suggested_due_date_extension  
0         0                          40.0  
1         0                          40.0  
2         0                          40.0  
3         0                          40.0  
4         0                          40.0  
..      ...                           ...  
71        0                          40.0  
72        0                          40.0  
73        0                          40.0  
74        0                          40.0  
75        0                          40.0  

[76 rows x 6 columns]
Vendor recommendations saved to: vendor_recommendations.json

Credit Risk Analysis Debtors Accounts¶

Step 1: Import Libraries and Load the Data: Use pandas to load the data from the file.¶

Step 2 : Data Preprocessing:¶

Step 3 : Feature Engineering¶

Step 4 : Analyze Payment Behavior¶

Step 5 : Cluster Analysis¶

Step 6 Determine Better Due Dates¶

	Cutomer ID	Customer Name	Invoice Number	Amount	Invoice Date	Due Date
0	CA00040	Hayes-Koepp	CI47849371	5,506.32	12/11/2020	1/10/2021
1	CA00394	Aufderhar-Gaylord	UI98518744	7,882.92	12/4/2020	1/3/2021
2	CA00419	Witting and Sons	UI76967580	8,081.44	8/7/2020	9/6/2020
3	CA00897	Williamson, Harvey and Lubowitz	CI30071955	9,003.04	12/20/2020	1/19/2021
4	CA00271	Zemlak-Anderson	CI88968287	9,257.06	9/13/2020	10/13/2020